{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_string(string):\n",
    "    string = re.sub('[^A-Za-z0-9\\-\\/ ]+', ' ', string).split()\n",
    "    return [y.strip() for y in string]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_raw(filename):\n",
    "    with open(filename, 'r') as fopen:\n",
    "        entities = fopen.read()\n",
    "    soup = BeautifulSoup(entities, 'html.parser')\n",
    "    inside_tag = ''\n",
    "    texts, labels = [], []\n",
    "    for sentence in soup.prettify().split('\\n'):\n",
    "        if len(inside_tag):\n",
    "            splitted = process_string(sentence)\n",
    "            texts += splitted\n",
    "            labels += [inside_tag] * len(splitted)\n",
    "            inside_tag = ''\n",
    "        else:\n",
    "            if not sentence.find('</'):\n",
    "                pass\n",
    "            elif not sentence.find('<'):\n",
    "                inside_tag = sentence.split('>')[0][1:]\n",
    "            else:\n",
    "                splitted = process_string(sentence)\n",
    "                texts += splitted\n",
    "                labels += ['OTHER'] * len(splitted)\n",
    "    assert (len(texts)==len(labels)), \"length texts and labels are not same\"\n",
    "    print('len texts and labels: ', len(texts))\n",
    "    return texts,labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len texts and labels:  34012\n"
     ]
    }
   ],
   "source": [
    "train_texts, train_labels = parse_raw('data_train.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len texts and labels:  9249\n"
     ]
    }
   ],
   "source": [
    "test_texts, test_labels = parse_raw('data_test.txt')\n",
    "train_texts += test_texts\n",
    "train_labels += test_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['OTHER', 'location', 'organization', 'person', 'quantity', 'time'],\n",
       "       dtype='<U12'), array([35613,  1536,  1592,  2358,  1336,   826]))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(train_labels,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('entities-bm-normalize-v3.txt','r') as fopen:\n",
    "    entities_bm = fopen.read().split('\\n')[:-1]\n",
    "entities_bm = [i.split() for i in entities_bm]\n",
    "entities_bm = [[i[0],'TIME' if i[0] in 'jam' else i[1]] for i in entities_bm]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'KN'\n",
      "'KA'\n"
     ]
    }
   ],
   "source": [
    "replace_by = {'LOC':'location','PRN':'person','NORP':'organization','ORG':'organization','LAW':'law',\n",
    "             'EVENT':'OTHER','FAC':'organization','TIME':'time','O':'OTHER','ART':'person','DOC':'law'}\n",
    "for i in entities_bm:\n",
    "    try:\n",
    "        string = process_string(i[0])\n",
    "        if len(string):\n",
    "            train_labels.append(replace_by[i[1]])\n",
    "            train_texts.append(process_string(i[0])[0])  \n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        \n",
    "assert (len(train_texts)==len(train_labels)), \"length texts and labels are not same\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['OTHER', 'law', 'location', 'organization', 'person', 'quantity',\n",
       "        'time'], dtype='<U12'),\n",
       " array([47406,   107,  2010,  2435,  3913,  1336,  1240]))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(train_labels,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(58447, 17758)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target = LabelEncoder().fit_transform(train_labels)\n",
    "bow_chars = CountVectorizer(ngram_range=(1, 4), analyzer='char').fit(train_texts)\n",
    "vectors = bow_chars.transform(train_texts)\n",
    "vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)\n",
    "del vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation-mlogloss:1.71443\n",
      "Will train until validation-mlogloss hasn't improved in 100 rounds.\n",
      "[50]\tvalidation-mlogloss:0.454539\n",
      "[100]\tvalidation-mlogloss:0.402853\n",
      "[150]\tvalidation-mlogloss:0.383168\n",
      "[200]\tvalidation-mlogloss:0.371301\n",
      "[250]\tvalidation-mlogloss:0.362994\n",
      "[300]\tvalidation-mlogloss:0.356895\n",
      "[350]\tvalidation-mlogloss:0.351588\n",
      "[400]\tvalidation-mlogloss:0.347524\n",
      "[450]\tvalidation-mlogloss:0.344712\n",
      "[500]\tvalidation-mlogloss:0.342025\n",
      "[550]\tvalidation-mlogloss:0.339598\n",
      "[600]\tvalidation-mlogloss:0.337423\n",
      "[650]\tvalidation-mlogloss:0.335829\n",
      "[700]\tvalidation-mlogloss:0.334537\n",
      "[750]\tvalidation-mlogloss:0.333315\n",
      "[800]\tvalidation-mlogloss:0.332229\n",
      "[850]\tvalidation-mlogloss:0.331261\n",
      "[900]\tvalidation-mlogloss:0.330584\n",
      "[950]\tvalidation-mlogloss:0.329828\n",
      "[1000]\tvalidation-mlogloss:0.329271\n",
      "[1050]\tvalidation-mlogloss:0.328545\n",
      "[1100]\tvalidation-mlogloss:0.32779\n",
      "[1150]\tvalidation-mlogloss:0.327664\n",
      "[1200]\tvalidation-mlogloss:0.327273\n",
      "[1250]\tvalidation-mlogloss:0.326907\n",
      "[1300]\tvalidation-mlogloss:0.326394\n",
      "[1350]\tvalidation-mlogloss:0.326084\n",
      "[1400]\tvalidation-mlogloss:0.325755\n",
      "[1450]\tvalidation-mlogloss:0.325493\n",
      "[1500]\tvalidation-mlogloss:0.325221\n",
      "[1550]\tvalidation-mlogloss:0.324952\n",
      "[1600]\tvalidation-mlogloss:0.324784\n",
      "[1650]\tvalidation-mlogloss:0.324688\n",
      "[1700]\tvalidation-mlogloss:0.324377\n",
      "[1750]\tvalidation-mlogloss:0.324172\n",
      "[1800]\tvalidation-mlogloss:0.323985\n",
      "[1850]\tvalidation-mlogloss:0.323854\n",
      "[1900]\tvalidation-mlogloss:0.323784\n",
      "[1950]\tvalidation-mlogloss:0.323488\n",
      "[2000]\tvalidation-mlogloss:0.323354\n",
      "[2050]\tvalidation-mlogloss:0.3232\n",
      "[2100]\tvalidation-mlogloss:0.323201\n",
      "[2150]\tvalidation-mlogloss:0.322986\n",
      "[2200]\tvalidation-mlogloss:0.322805\n",
      "[2250]\tvalidation-mlogloss:0.322691\n",
      "[2300]\tvalidation-mlogloss:0.322681\n",
      "[2350]\tvalidation-mlogloss:0.322679\n",
      "Stopping. Best iteration:\n",
      "[2255]\tvalidation-mlogloss:0.322645\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_d = xgb.DMatrix(train_X, train_Y)\n",
    "test_d = xgb.DMatrix(test_X, test_Y)\n",
    "params_xgd = {\n",
    "    'min_child_weight': 10.0,\n",
    "    'max_depth': 14,\n",
    "    'objective': 'multi:softprob',\n",
    "    'max_delta_step': 1.8,\n",
    "    'num_class': len(np.unique(target)),\n",
    "    'colsample_bytree': 0.4,\n",
    "    'subsample': 0.8,\n",
    "    'learning_rate': 0.1,\n",
    "    'gamma': 0.65,\n",
    "    'silent':True,\n",
    "    'eval_metric': 'mlogloss'\n",
    "}\n",
    "model = xgb.train(params_xgd, train_d, 100000, evals=[(test_d, 'validation')], \n",
    "                  early_stopping_rounds=100, verbose_eval=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "       OTHER       0.93      0.97      0.95      9505\n",
      "         law       1.00      0.43      0.60        14\n",
      "    location       0.68      0.61      0.64       389\n",
      "organization       0.63      0.49      0.55       490\n",
      "      person       0.83      0.71      0.76       779\n",
      "    quantity       0.63      0.61      0.62       251\n",
      "        time       0.84      0.60      0.70       262\n",
      "\n",
      " avg / total       0.90      0.90      0.90     11690\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = np.unique(train_labels)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "news = 'ikat penyedia perkhidmatan jalur lebar Telekom Malaysia (TM) perlu mencari jalan penyelesaian bagi meningkatkan akses capaian Internet ke seluruh negara, kata Menteri Komunikasi dan Multimedia, Gobind Singh Deo. Beliau berkata menjadi dasar kerajaan untuk membekalkan akses Internet jalur lebar kepada semua dan memberi penekanan kepada kualiti perkhidmatan yang terbaik. \"Dasar kerajaan untuk bekalkan akses kepada semua bukan sekadar pembekalan sahaja tetapi beri penekanan kepada kualiti perkhidmatan yang baik dan dapat bersaing dengan negara lain pada tahap antarabangsa,\" kata Gobind Singh menerusi catatan di laman rasmi Twitter beliau, malam tadi. Beliau berkata demikian sebagai respons terhadap aduan beberapa pengguna Twitter berhubung akses Internet yang masih tidak stabil serta harga yang tidak berpatutan di beberapa lokasi di seluruh negara.'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('xgb-entities.pkl','wb') as fopen:\n",
    "    pickle.dump(model,fopen)\n",
    "with open('xgb-bow-entities.pkl','wb') as fopen:\n",
    "    pickle.dump(bow_chars,fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
